#####################################################
# Script to create network graphs of genes shared between GO Slim terms.
# Requires a matrix or data frame (read in from a file) 
# containing Swiss-Pro IDs (Gene_ID), GO domain (GO.domain2), and GO Slim terms (GO.slim).
# (If column names are different, code will have to be changed appropriately.)
# Creates data frames with the number of genes per GO Slim terms and
# the number of genes shared between GO-slim terms and
# uses these matrices to create network graphs using iGraph.
#
# Author: Sonia Singhal, 23 Nov 2012
# singhal@u.washington.edu
#####################################################

library(igraph)

# Function to get a vector of unique GO Slim terms from a data table.
getGOslimNames <- function(GOslim_data){
  return (unique(GOslim_data$GO.slim))
}

# For each GO Slim term, count the number of genes involved
getGenesPerSlim <- function(GOslim_data){
  GOslim <- getGOslimNames(GOslim_data)
  genes_per_slim <- data.frame(GOslim = GOslim, num_genes = rep(0, length(GOslim)))
  for (s in GOslim){
    sub <- subset(GOslim_data, GO.slim == s, select = c(Gene_ID, GO.slim))
    genes_per_slim$num_genes[genes_per_slim$GOslim == s] <- length(unique(sub$Gene_ID))
  }
  return (genes_per_slim)
}

# Create an adjacency matrix from shared genes between GO-slim terms.
makeAdjancencyMat <- function(GOslim_data){
  # Initialize the network adjacency matrix.
  GOslim <- getGOslimNames(GOslim_data)
  GOslim_mat <- matrix(nrow = length(GOslim), ncol = length(GOslim), data = 0)
  dimnames(GOslim_mat)[[1]] <- GOslim
  dimnames(GOslim_mat)[[2]] <- GOslim
  
  # For each gene, count the number of GO-slim terms
  # Insert the values in the matrix
  genes <- unique(GOslim_data$Gene_ID)
  for(g in genes){
    sub <-subset(GOslim_data, Gene_ID == g, select = c(Gene_ID, GO.slim))
    slim_terms <- unique(sub$GO.slim)
    if(length(slim_terms) > 1){
      for(term1 in slim_terms){
        for(term2 in slim_terms){
          if (term1 == term2){
            next
          }
          GOslim_mat[which(GOslim == term1), which(GOslim == term2)] <- GOslim_mat[which(GOslim == term1), which(GOslim == term2)] + 1
        }
      }
    }
  }
  return (GOslim_mat)
}

# Initialize the graph
makeIGraph <- function(GOslim_mat, genes_per_slim){
  g <- graph.adjacency(GOslim_mat, mode = "undirected", weighted = TRUE)
  g <- set.vertex.attribute(g, "name", value = as.character(genes_per_slim$GOslim))
  g <- set.vertex.attribute(g, "number", value = genes_per_slim$num_genes)
  return (g)
}


# Read in data. Filename will need to be changed for a different file/ computer.
filename <- "C:/Users/Sonia/Documents/FHL/ID for DEG in QPX (trimmed).txt"
diff_gene_data <- read.delim(filename, stringsAsFactors = FALSE)
# extract genes involved in biological processes
proc_data <- subset(diff_gene_data, GO.domain2 == "P")
# Remove "other" categories in the GO-slim terms
proc_data <- subset(proc_data, GO.slim != "other biological processes")
proc_data <- subset(proc_data, GO.slim != "other metabolic processes")
# Find the number of genes per GO-slim term
proc_genes_per_slim <- getGenesPerSlim(proc_data)
# Create the adjacency matrix of connections
GOslim_proc_mat <- makeAdjancencyMat(proc_data)
# Initialize the graph
proc_graph <- makeIGraph(GOslim_proc_mat, proc_genes_per_slim)
# add carriage returns into the two longest names
V(proc_graph)$name[2] <- "cell cycle and\nproliferation"
V(proc_graph)$name[3] <- "cell organization\nand biogenesis"

# The figures that appear in the paper were created first 
# by plotting with the tkplot function in iGraph, which allows for
# interactive arrangement of the nodes in the graph.
# The position of each node was obtained by querying the plot for the coordinates
# and tweaked in the basic plotting function (igraph.plot).
# Sample functions:
tkplot(proc_graph, 
       vertex.label = V(proc_graph)$name,
       vertex.size = log(V(proc_graph)$number)*10,
       edge.width = E(proc_graph)$weight, 
       edge.label = E(proc_graph)$weight,
       edge.color = "black")

proc_coords <- tkplot.getcoords(2) # 2 is the number of the particular tkplot.

# Visualize graph in a TIFF file
dev.set(dev.next())
tiff("proc_network.tiff")
plot.igraph(proc_graph,        
            vertex.label = V(proc_graph)$name,
            vertex.size = log(V(proc_graph)$number)*10, # Scale node size by the number of genes
            vertex.label.degree = c(0, 0, 0, pi/2, pi, pi, pi, pi, -pi/2, 0, 0), # Arrange the labels around the circumference of the graph. Starts at "Transport" and moves clockwise
            vertex.label.dist = c(2, 1.3, 1.3, 0.5, 2.3, 1.7, 2, 2, 0.5, 1.1, 1.1), # Move the labels away from the center of each node
            vertex.label.color = "black",
            edge.width = E(proc_graph)$weight, # Change width of connections based on the number of shared genes 
            layout = proc_coords2, # Use the coordinates from the tkplot to arrange nodes
            edge.color = "black")
dev.off()